import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as plotly
import plotly.express as px
plotly.offline.init_notebook_mode()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the data
df = pd.read_csv('creditcard.csv')
df
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 284802 | 172786.0 | -11.881118 | 10.071785 | -9.834783 | -2.066656 | -5.364473 | -2.606837 | -4.918215 | 7.305334 | 1.914428 | ... | 0.213454 | 0.111864 | 1.014480 | -0.509348 | 1.436807 | 0.250034 | 0.943651 | 0.823731 | 0.77 | 0 |
| 284803 | 172787.0 | -0.732789 | -0.055080 | 2.035030 | -0.738589 | 0.868229 | 1.058415 | 0.024330 | 0.294869 | 0.584800 | ... | 0.214205 | 0.924384 | 0.012463 | -1.016226 | -0.606624 | -0.395255 | 0.068472 | -0.053527 | 24.79 | 0 |
| 284804 | 172788.0 | 1.919565 | -0.301254 | -3.249640 | -0.557828 | 2.630515 | 3.031260 | -0.296827 | 0.708417 | 0.432454 | ... | 0.232045 | 0.578229 | -0.037501 | 0.640134 | 0.265745 | -0.087371 | 0.004455 | -0.026561 | 67.88 | 0 |
| 284805 | 172788.0 | -0.240440 | 0.530483 | 0.702510 | 0.689799 | -0.377961 | 0.623708 | -0.686180 | 0.679145 | 0.392087 | ... | 0.265245 | 0.800049 | -0.163298 | 0.123205 | -0.569159 | 0.546668 | 0.108821 | 0.104533 | 10.00 | 0 |
| 284806 | 172792.0 | -0.533413 | -0.189733 | 0.703337 | -0.506271 | -0.012546 | -0.649617 | 1.577006 | -0.414650 | 0.486180 | ... | 0.261057 | 0.643078 | 0.376777 | 0.008797 | -0.473649 | -0.818267 | -0.002415 | 0.013649 | 217.00 | 0 |
284807 rows × 31 columns
df.describe()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 284807.000000 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | ... | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 284807.000000 | 284807.000000 |
| mean | 94813.859575 | 1.168375e-15 | 3.416908e-16 | -1.379537e-15 | 2.074095e-15 | 9.604066e-16 | 1.487313e-15 | -5.556467e-16 | 1.213481e-16 | -2.406331e-15 | ... | 1.654067e-16 | -3.568593e-16 | 2.578648e-16 | 4.473266e-15 | 5.340915e-16 | 1.683437e-15 | -3.660091e-16 | -1.227390e-16 | 88.349619 | 0.001727 |
| std | 47488.145955 | 1.958696e+00 | 1.651309e+00 | 1.516255e+00 | 1.415869e+00 | 1.380247e+00 | 1.332271e+00 | 1.237094e+00 | 1.194353e+00 | 1.098632e+00 | ... | 7.345240e-01 | 7.257016e-01 | 6.244603e-01 | 6.056471e-01 | 5.212781e-01 | 4.822270e-01 | 4.036325e-01 | 3.300833e-01 | 250.120109 | 0.041527 |
| min | 0.000000 | -5.640751e+01 | -7.271573e+01 | -4.832559e+01 | -5.683171e+00 | -1.137433e+02 | -2.616051e+01 | -4.355724e+01 | -7.321672e+01 | -1.343407e+01 | ... | -3.483038e+01 | -1.093314e+01 | -4.480774e+01 | -2.836627e+00 | -1.029540e+01 | -2.604551e+00 | -2.256568e+01 | -1.543008e+01 | 0.000000 | 0.000000 |
| 25% | 54201.500000 | -9.203734e-01 | -5.985499e-01 | -8.903648e-01 | -8.486401e-01 | -6.915971e-01 | -7.682956e-01 | -5.540759e-01 | -2.086297e-01 | -6.430976e-01 | ... | -2.283949e-01 | -5.423504e-01 | -1.618463e-01 | -3.545861e-01 | -3.171451e-01 | -3.269839e-01 | -7.083953e-02 | -5.295979e-02 | 5.600000 | 0.000000 |
| 50% | 84692.000000 | 1.810880e-02 | 6.548556e-02 | 1.798463e-01 | -1.984653e-02 | -5.433583e-02 | -2.741871e-01 | 4.010308e-02 | 2.235804e-02 | -5.142873e-02 | ... | -2.945017e-02 | 6.781943e-03 | -1.119293e-02 | 4.097606e-02 | 1.659350e-02 | -5.213911e-02 | 1.342146e-03 | 1.124383e-02 | 22.000000 | 0.000000 |
| 75% | 139320.500000 | 1.315642e+00 | 8.037239e-01 | 1.027196e+00 | 7.433413e-01 | 6.119264e-01 | 3.985649e-01 | 5.704361e-01 | 3.273459e-01 | 5.971390e-01 | ... | 1.863772e-01 | 5.285536e-01 | 1.476421e-01 | 4.395266e-01 | 3.507156e-01 | 2.409522e-01 | 9.104512e-02 | 7.827995e-02 | 77.165000 | 0.000000 |
| max | 172792.000000 | 2.454930e+00 | 2.205773e+01 | 9.382558e+00 | 1.687534e+01 | 3.480167e+01 | 7.330163e+01 | 1.205895e+02 | 2.000721e+01 | 1.559499e+01 | ... | 2.720284e+01 | 1.050309e+01 | 2.252841e+01 | 4.584549e+00 | 7.519589e+00 | 3.517346e+00 | 3.161220e+01 | 3.384781e+01 | 25691.160000 | 1.000000 |
8 rows × 31 columns
We will be using the whiskers / IQR approach to remove outliers.
Before we do, let us visualize the amount of outliers in some features.
# boxplot for v17 feature
plt.title('V17 feature before removing outliers')
sns.boxplot(x='Class', y='V17', data=df)
plt.show()
# boxplot for v14 feature
plt.title('V14 feature before removing outliers')
sns.boxplot(x='Class', y='V14', data=df)
plt.show()
#for amount
plt.title('Amount feature before removing outliers')
sns.boxplot(x='Class', y='Amount', data=df)
plt.show()
Let's look at the datatype of the features in our dataset.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 284807 entries, 0 to 284806 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Time 284807 non-null float64 1 V1 284807 non-null float64 2 V2 284807 non-null float64 3 V3 284807 non-null float64 4 V4 284807 non-null float64 5 V5 284807 non-null float64 6 V6 284807 non-null float64 7 V7 284807 non-null float64 8 V8 284807 non-null float64 9 V9 284807 non-null float64 10 V10 284807 non-null float64 11 V11 284807 non-null float64 12 V12 284807 non-null float64 13 V13 284807 non-null float64 14 V14 284807 non-null float64 15 V15 284807 non-null float64 16 V16 284807 non-null float64 17 V17 284807 non-null float64 18 V18 284807 non-null float64 19 V19 284807 non-null float64 20 V20 284807 non-null float64 21 V21 284807 non-null float64 22 V22 284807 non-null float64 23 V23 284807 non-null float64 24 V24 284807 non-null float64 25 V25 284807 non-null float64 26 V26 284807 non-null float64 27 V27 284807 non-null float64 28 V28 284807 non-null float64 29 Amount 284807 non-null float64 30 Class 284807 non-null int64 dtypes: float64(30), int64(1) memory usage: 67.4 MB
The instruction said handle outliers for all the numerical features in the dataset, looking at the above, we will remove outliers in all our features except the target.
feature_list = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
def calculate_outliers(df):
cols = feature_list
outliers_rows = []
for col in cols:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
print(f'Number of outliers in {col}: {len(outliers)}, Percentage of outliers: {len(outliers) / len(df) * 100:.2f}%')
outliers_rows.extend(outliers.index.tolist())
# Get unique row indices with outliers
outliers_rows = list(set(outliers_rows))
print('\n')
print(f'Number of rows with at least one outlier: {len(outliers_rows)}, Percentage of rows with at least one outlier: {len(outliers_rows) / df.shape[0] * 100:.2f}%')
# Create boolean array indicating which rows have at least one outlier
outliers_bool = df.index.isin(outliers_rows)
return outliers_bool
iqr_outliers = calculate_outliers(df)
Number of outliers in Time: 0, Percentage of outliers: 0.00% Number of outliers in V1: 7062, Percentage of outliers: 2.48% Number of outliers in V2: 13526, Percentage of outliers: 4.75% Number of outliers in V3: 3363, Percentage of outliers: 1.18% Number of outliers in V4: 11148, Percentage of outliers: 3.91% Number of outliers in V5: 12295, Percentage of outliers: 4.32% Number of outliers in V6: 22965, Percentage of outliers: 8.06% Number of outliers in V7: 8948, Percentage of outliers: 3.14% Number of outliers in V8: 24134, Percentage of outliers: 8.47% Number of outliers in V9: 8283, Percentage of outliers: 2.91% Number of outliers in V10: 9496, Percentage of outliers: 3.33% Number of outliers in V11: 780, Percentage of outliers: 0.27% Number of outliers in V12: 15348, Percentage of outliers: 5.39% Number of outliers in V13: 3368, Percentage of outliers: 1.18% Number of outliers in V14: 14149, Percentage of outliers: 4.97% Number of outliers in V15: 2894, Percentage of outliers: 1.02% Number of outliers in V16: 8184, Percentage of outliers: 2.87% Number of outliers in V17: 7420, Percentage of outliers: 2.61% Number of outliers in V18: 7533, Percentage of outliers: 2.64% Number of outliers in V19: 10205, Percentage of outliers: 3.58% Number of outliers in V20: 27770, Percentage of outliers: 9.75% Number of outliers in V21: 14497, Percentage of outliers: 5.09% Number of outliers in V22: 1317, Percentage of outliers: 0.46% Number of outliers in V23: 18541, Percentage of outliers: 6.51% Number of outliers in V24: 4774, Percentage of outliers: 1.68% Number of outliers in V25: 5367, Percentage of outliers: 1.88% Number of outliers in V26: 5596, Percentage of outliers: 1.96% Number of outliers in V27: 39163, Percentage of outliers: 13.75% Number of outliers in V28: 30342, Percentage of outliers: 10.65% Number of outliers in Amount: 31904, Percentage of outliers: 11.20% Number of rows with at least one outlier: 138473, Percentage of rows with at least one outlier: 48.62%
no_outliers_df = df[~iqr_outliers]
no_outliers_df
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
| 5 | 2.0 | -0.425966 | 0.960523 | 1.141109 | -0.168252 | 0.420987 | -0.029728 | 0.476201 | 0.260314 | -0.568671 | ... | -0.208254 | -0.559825 | -0.026398 | -0.371427 | -0.232794 | 0.105915 | 0.253844 | 0.081080 | 3.67 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 284796 | 172780.0 | 1.884849 | -0.143540 | -0.999943 | 1.506772 | -0.035300 | -0.613638 | 0.190241 | -0.249058 | 0.666458 | ... | 0.144008 | 0.634646 | -0.042114 | -0.053206 | 0.316403 | -0.461441 | 0.018265 | -0.041068 | 60.00 | 0 |
| 284797 | 172782.0 | -0.241923 | 0.712247 | 0.399806 | -0.463406 | 0.244531 | -1.343668 | 0.929369 | -0.206210 | 0.106234 | ... | -0.228876 | -0.514376 | 0.279598 | 0.371441 | -0.559238 | 0.113144 | 0.131507 | 0.081265 | 5.49 | 0 |
| 284800 | 172784.0 | 2.039560 | -0.175233 | -1.196825 | 0.234580 | -0.008713 | -0.726571 | 0.017050 | -0.118228 | 0.435402 | ... | -0.268048 | -0.717211 | 0.297930 | -0.359769 | -0.315610 | 0.201114 | -0.080826 | -0.075071 | 2.68 | 0 |
| 284801 | 172785.0 | 0.120316 | 0.931005 | -0.546012 | -0.745097 | 1.130314 | -0.235973 | 0.812722 | 0.115093 | -0.204064 | ... | -0.314205 | -0.808520 | 0.050343 | 0.102800 | -0.435870 | 0.124079 | 0.217940 | 0.068803 | 2.69 | 0 |
| 284803 | 172787.0 | -0.732789 | -0.055080 | 2.035030 | -0.738589 | 0.868229 | 1.058415 | 0.024330 | 0.294869 | 0.584800 | ... | 0.214205 | 0.924384 | 0.012463 | -1.016226 | -0.606624 | -0.395255 | 0.068472 | -0.053527 | 24.79 | 0 |
146334 rows × 31 columns
What this means is that we have lost 48.62% of our records and are now left with 146,334 records, which is still a pretty good number.
Let's look at those records again now that we have removed most of the outliers.
# boxplot for v17 feature
plt.title('V17 feature after removing outliers')
sns.boxplot(x='Class', y='V17', data=no_outliers_df)
plt.show()
# boxplot for v14 feature
plt.title('V14 feature after removing outliers')
sns.boxplot(x='Class', y='V14', data=no_outliers_df)
plt.show()
The below is a direct quote from the kaggle dataset.
It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
Principal Component Analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. The number of principal components is less than or equal to the number of original variables. This transformation is defined in such a way that the first principal component has the largest possible variance, and each succeeding component, in turn, has the highest variance possible under the constraint that it is orthogonal to the preceding components. The resulting vectors (principal components) are an uncorrelated orthogonal basis set.
Let's see what all the technical jargons above mean for our dataset.
no_outliers_df.describe()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | ... | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 |
| mean | 95576.533109 | 0.689479 | 0.065514 | 0.151846 | 0.008564 | -0.132735 | -0.377081 | 0.011083 | 0.031417 | -0.065259 | ... | -0.037541 | -0.038136 | 0.007977 | -0.000771 | 0.026885 | -0.015648 | 0.018484 | 0.009894 | 34.895958 | 0.000103 |
| std | 46781.425189 | 1.191887 | 0.759219 | 1.179382 | 1.067758 | 0.764318 | 0.640091 | 0.680457 | 0.309031 | 0.866178 | ... | 0.242236 | 0.662272 | 0.194051 | 0.519107 | 0.422551 | 0.430137 | 0.111440 | 0.079614 | 41.822617 | 0.010124 |
| min | 0.000000 | -4.000703 | -2.629561 | -3.763535 | -3.234792 | -2.644680 | -2.501379 | -2.238746 | -1.011230 | -2.503120 | ... | -0.849287 | -2.132206 | -0.625903 | -1.544928 | -1.318690 | -1.178460 | -0.313658 | -0.249694 | 0.000000 | 0.000000 |
| 25% | 55233.250000 | -0.447437 | -0.432748 | -0.657034 | -0.726678 | -0.638769 | -0.818151 | -0.474484 | -0.176317 | -0.583642 | ... | -0.227876 | -0.583887 | -0.117911 | -0.333400 | -0.287113 | -0.314764 | -0.048077 | -0.043591 | 4.380000 | 0.000000 |
| 50% | 81701.000000 | 1.132213 | 0.051078 | 0.275632 | 0.114266 | -0.132104 | -0.440719 | 0.054474 | -0.004907 | -0.063699 | ... | -0.050773 | -0.043178 | 0.001174 | 0.035755 | 0.040798 | -0.041862 | 0.003430 | 0.009687 | 16.570000 | 0.000000 |
| 75% | 140376.000000 | 1.869928 | 0.642562 | 1.029075 | 0.691059 | 0.376857 | 0.007746 | 0.492660 | 0.196328 | 0.477281 | ... | 0.141951 | 0.459555 | 0.131522 | 0.392546 | 0.348641 | 0.203760 | 0.064287 | 0.043134 | 50.000000 | 0.000000 |
| max | 172787.000000 | 2.454930 | 2.871336 | 3.790995 | 3.131204 | 2.565293 | 2.146309 | 2.251729 | 1.131305 | 2.456673 | ... | 0.808491 | 2.118715 | 0.611675 | 1.299212 | 1.350739 | 1.092785 | 0.333860 | 0.275066 | 184.500000 | 1.000000 |
8 rows × 31 columns
If we look at the features that have had the PCA applied to them we can see that they all have very similar min and max value, compared to the Time and Amount section.
For the sake of completeness, we will normalize these two features.
# boxplot for v17 feature
plt.title('Time')
sns.boxplot(x='Class', y='Time', data=no_outliers_df)
plt.show()
# boxplot for v14 feature
plt.title('Amount')
sns.boxplot(x='Class', y='Amount', data=no_outliers_df)
plt.show()
sns.displot(no_outliers_df, x = "Amount", kde=True, bins=50, aspect=2)
plt.title('Price Distribution')
plt.xlabel('Price')
plt.ylabel('Density')
plt.show()
We still have a lot of outliers in our Amount feature so this will affect what normalization technique we will use.
We will use the robust scaling method because it is least sensitive to outliers compared to min-max or zscore.
no_outliers_df_copy = no_outliers_df.copy()
scaler = RobustScaler()
no_outliers_df_copy['Time'] = scaler.fit_transform(no_outliers_df_copy['Time'].values.reshape(-1, 1))
no_outliers_df_copy['Amount'] = scaler.fit_transform(no_outliers_df_copy['Amount'].values.reshape(-1, 1))
no_outliers_df_copy
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.959577 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 2.916484 | 0 |
| 1 | -0.959577 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | -0.304253 | 0 |
| 3 | -0.959565 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 2.343928 | 0 |
| 4 | -0.959553 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 1.170978 | 0 |
| 5 | -0.959553 | -0.425966 | 0.960523 | 1.141109 | -0.168252 | 0.420987 | -0.029728 | 0.476201 | 0.260314 | -0.568671 | ... | -0.208254 | -0.559825 | -0.026398 | -0.371427 | -0.232794 | 0.105915 | 0.253844 | 0.081080 | -0.282771 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 284796 | 1.069721 | 1.884849 | -0.143540 | -0.999943 | 1.506772 | -0.035300 | -0.613638 | 0.190241 | -0.249058 | 0.666458 | ... | 0.144008 | 0.634646 | -0.042114 | -0.053206 | 0.316403 | -0.461441 | 0.018265 | -0.041068 | 0.951995 | 0 |
| 284797 | 1.069745 | -0.241923 | 0.712247 | 0.399806 | -0.463406 | 0.244531 | -1.343668 | 0.929369 | -0.206210 | 0.106234 | ... | -0.228876 | -0.514376 | 0.279598 | 0.371441 | -0.559238 | 0.113144 | 0.131507 | 0.081265 | -0.242876 | 0 |
| 284800 | 1.069768 | 2.039560 | -0.175233 | -1.196825 | 0.234580 | -0.008713 | -0.726571 | 0.017050 | -0.118228 | 0.435402 | ... | -0.268048 | -0.717211 | 0.297930 | -0.359769 | -0.315610 | 0.201114 | -0.080826 | -0.075071 | -0.304472 | 0 |
| 284801 | 1.069780 | 0.120316 | 0.931005 | -0.546012 | -0.745097 | 1.130314 | -0.235973 | 0.812722 | 0.115093 | -0.204064 | ... | -0.314205 | -0.808520 | 0.050343 | 0.102800 | -0.435870 | 0.124079 | 0.217940 | 0.068803 | -0.304253 | 0 |
| 284803 | 1.069803 | -0.732789 | -0.055080 | 2.035030 | -0.738589 | 0.868229 | 1.058415 | 0.024330 | 0.294869 | 0.584800 | ... | 0.214205 | 0.924384 | 0.012463 | -1.016226 | -0.606624 | -0.395255 | 0.068472 | -0.053527 | 0.180184 | 0 |
146334 rows × 31 columns
no_outliers_df_copy.describe()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | ... | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 | 146334.000000 |
| mean | 0.162968 | 0.689479 | 0.065514 | 0.151846 | 0.008564 | -0.132735 | -0.377081 | 0.011083 | 0.031417 | -0.065259 | ... | -0.037541 | -0.038136 | 0.007977 | -0.000771 | 0.026885 | -0.015648 | 0.018484 | 0.009894 | 0.401709 | 0.000103 |
| std | 0.549447 | 1.191887 | 0.759219 | 1.179382 | 1.067758 | 0.764318 | 0.640091 | 0.680457 | 0.309031 | 0.866178 | ... | 0.242236 | 0.662272 | 0.194051 | 0.519107 | 0.422551 | 0.430137 | 0.111440 | 0.079614 | 0.916761 | 0.010124 |
| min | -0.959577 | -4.000703 | -2.629561 | -3.763535 | -3.234792 | -2.644680 | -2.501379 | -2.238746 | -1.011230 | -2.503120 | ... | -0.849287 | -2.132206 | -0.625903 | -1.544928 | -1.318690 | -1.178460 | -0.313658 | -0.249694 | -0.363218 | 0.000000 |
| 25% | -0.310863 | -0.447437 | -0.432748 | -0.657034 | -0.726678 | -0.638769 | -0.818151 | -0.474484 | -0.176317 | -0.583642 | ... | -0.227876 | -0.583887 | -0.117911 | -0.333400 | -0.287113 | -0.314764 | -0.048077 | -0.043591 | -0.267207 | 0.000000 |
| 50% | 0.000000 | 1.132213 | 0.051078 | 0.275632 | 0.114266 | -0.132104 | -0.440719 | 0.054474 | -0.004907 | -0.063699 | ... | -0.050773 | -0.043178 | 0.001174 | 0.035755 | 0.040798 | -0.041862 | 0.003430 | 0.009687 | 0.000000 | 0.000000 |
| 75% | 0.689137 | 1.869928 | 0.642562 | 1.029075 | 0.691059 | 0.376857 | 0.007746 | 0.492660 | 0.196328 | 0.477281 | ... | 0.141951 | 0.459555 | 0.131522 | 0.392546 | 0.348641 | 0.203760 | 0.064287 | 0.043134 | 0.732793 | 0.000000 |
| max | 1.069803 | 2.454930 | 2.871336 | 3.790995 | 3.131204 | 2.565293 | 2.146309 | 2.251729 | 1.131305 | 2.456673 | ... | 0.808491 | 2.118715 | 0.611675 | 1.299212 | 1.350739 | 1.092785 | 0.333860 | 0.275066 | 3.681061 | 1.000000 |
8 rows × 31 columns
Looks good now.
cleaned_df = no_outliers_df_copy.copy()
class_distribution = cleaned_df['Class'].value_counts()
class_distribution
Class 0 146319 1 15 Name: count, dtype: int64
This is what we call a very imbalanced dataset and this will definitely affect how model's perfromance.
fig = px.pie(cleaned_df, names='Class', title='Class Distribution')
fig.show()
plt.figure(figsize=(8, 6))
sns.barplot(x=class_distribution.index, y=class_distribution)
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()
We can barely even see the representation of the psoitive class in our graphical representations.
This is usally the case in real world scenarios, mostly in fields like fraud detection or medical diagnosis where the event of interest is rare.
correlation = cleaned_df.corr()
display(round(correlation, 4))
#heatmap to visualize the correlation
plt.figure(figsize=(20, 20))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Time | 1.0000 | 0.2433 | -0.0790 | -0.5843 | -0.2062 | 0.2744 | -0.1706 | 0.1115 | -0.1860 | 0.1469 | ... | 0.0721 | 0.1124 | 0.2391 | -0.0003 | -0.3268 | -0.0553 | -0.0675 | -0.2024 | -0.0290 | -0.0048 |
| V1 | 0.2433 | 1.0000 | -0.6076 | -0.6536 | 0.1398 | -0.3027 | -0.0689 | -0.4339 | -0.4521 | 0.2883 | ... | -0.0852 | -0.0061 | 0.3593 | -0.0115 | 0.1505 | -0.0060 | -0.2669 | -0.3880 | 0.0078 | -0.0022 |
| V2 | -0.0790 | -0.6076 | 1.0000 | 0.1823 | 0.2804 | 0.5842 | -0.1425 | 0.7692 | 0.1306 | -0.0739 | ... | -0.0428 | -0.1154 | -0.2657 | 0.0413 | -0.0911 | 0.0162 | 0.2058 | 0.2990 | -0.3911 | 0.0079 |
| V3 | -0.5843 | -0.6536 | 0.1823 | 1.0000 | 0.0400 | -0.2696 | 0.3302 | -0.0865 | 0.4176 | -0.1229 | ... | -0.0075 | -0.0274 | -0.2511 | 0.0998 | 0.0022 | -0.0284 | 0.2603 | 0.3422 | -0.0024 | 0.0013 |
| V4 | -0.2062 | 0.1398 | 0.2804 | 0.0400 | 1.0000 | 0.1120 | 0.1844 | 0.1019 | 0.0484 | 0.2839 | ... | -0.0113 | -0.0382 | 0.0208 | 0.0818 | 0.1901 | -0.1013 | -0.0443 | 0.0126 | -0.0442 | 0.0095 |
| V5 | 0.2744 | -0.3027 | 0.5842 | -0.2696 | 0.1120 | 1.0000 | -0.1283 | 0.7820 | -0.1964 | -0.0433 | ... | 0.0024 | -0.0579 | -0.2347 | -0.2112 | -0.0318 | 0.0180 | 0.0135 | 0.0533 | -0.2647 | 0.0065 |
| V6 | -0.1706 | -0.0689 | -0.1425 | 0.3302 | 0.1844 | -0.1283 | 1.0000 | -0.3883 | 0.5862 | 0.1179 | ... | 0.0377 | 0.0833 | -0.1003 | -0.4003 | 0.0024 | -0.0658 | 0.1300 | 0.0225 | 0.1575 | 0.0010 |
| V7 | 0.1115 | -0.4339 | 0.7692 | -0.0865 | 0.1019 | 0.7820 | -0.3883 | 1.0000 | -0.3350 | -0.1210 | ... | -0.0445 | -0.1038 | -0.2654 | 0.0418 | -0.0263 | 0.0318 | 0.0402 | 0.1493 | -0.1172 | 0.0056 |
| V8 | -0.1860 | -0.4521 | 0.1306 | 0.4176 | 0.0484 | -0.1964 | 0.5862 | -0.3350 | 1.0000 | 0.0242 | ... | 0.0796 | 0.0530 | -0.0206 | -0.1525 | -0.1515 | -0.0206 | 0.2312 | 0.1944 | 0.0330 | -0.0010 |
| V9 | 0.1469 | 0.2883 | -0.0739 | -0.1229 | 0.2839 | -0.0433 | 0.1179 | -0.1210 | 0.0242 | 1.0000 | ... | 0.0174 | 0.0546 | 0.1690 | 0.0229 | -0.0454 | 0.0589 | -0.0420 | -0.1647 | -0.0507 | -0.0041 |
| V10 | -0.0079 | 0.4450 | -0.5536 | -0.1738 | -0.0873 | -0.4234 | 0.0838 | -0.4776 | -0.2174 | -0.4706 | ... | -0.0521 | 0.0490 | 0.1750 | -0.0477 | 0.0373 | -0.0781 | -0.0776 | -0.2171 | 0.1145 | -0.0035 |
| V11 | -0.2363 | 0.0197 | -0.0108 | 0.0740 | -0.0064 | -0.0305 | 0.0997 | -0.0556 | 0.1220 | -0.1410 | ... | 0.0489 | -0.0004 | 0.0692 | 0.1195 | 0.0131 | 0.0209 | -0.0409 | -0.0361 | -0.0250 | 0.0046 |
| V12 | 0.0322 | 0.0694 | 0.1483 | -0.0424 | 0.2686 | 0.1197 | 0.1373 | 0.0887 | 0.0918 | 0.2425 | ... | -0.0194 | -0.0219 | 0.0962 | 0.0583 | 0.0213 | 0.0043 | -0.0037 | -0.0344 | -0.0476 | -0.0034 |
| V13 | 0.0132 | 0.0571 | 0.0198 | -0.0096 | -0.0723 | 0.0247 | 0.0009 | -0.0095 | -0.2116 | -0.1469 | ... | -0.0487 | 0.0287 | 0.0030 | -0.0318 | 0.0043 | 0.0053 | 0.0088 | 0.0132 | -0.0129 | -0.0035 |
| V14 | -0.0984 | -0.0060 | 0.1206 | -0.0893 | 0.1204 | 0.0910 | -0.1211 | 0.1783 | 0.0352 | -0.0224 | ... | 0.0840 | -0.0007 | -0.0018 | -0.0424 | 0.0779 | 0.0050 | -0.1108 | -0.0572 | -0.0237 | -0.0126 |
| V15 | -0.2669 | 0.0538 | 0.0623 | 0.1119 | 0.0776 | -0.1427 | -0.1711 | -0.0468 | -0.1136 | 0.0219 | ... | 0.0515 | -0.0115 | 0.0454 | 0.0029 | 0.0434 | -0.0010 | 0.0021 | 0.0529 | -0.0528 | 0.0010 |
| V16 | 0.0050 | 0.0113 | 0.0262 | 0.0090 | -0.1122 | -0.0072 | -0.0114 | -0.1135 | 0.1031 | -0.0127 | ... | 0.2137 | 0.0381 | 0.0017 | -0.1024 | -0.0520 | 0.0085 | -0.0363 | 0.0066 | -0.0746 | 0.0063 |
| V17 | -0.1028 | -0.0210 | -0.0569 | -0.0318 | -0.0089 | -0.1654 | -0.0615 | -0.1306 | 0.0936 | -0.1441 | ... | -0.0374 | 0.0143 | 0.0493 | 0.2137 | 0.0402 | -0.0163 | 0.0519 | 0.0890 | 0.0496 | 0.0090 |
| V18 | 0.1231 | -0.0444 | -0.0095 | -0.0257 | -0.0157 | 0.0033 | 0.1187 | -0.0927 | 0.1484 | 0.0120 | ... | -0.0091 | 0.0519 | -0.1221 | -0.1144 | -0.0663 | 0.0157 | 0.0239 | 0.0149 | 0.0688 | 0.0048 |
| V19 | 0.0923 | 0.0377 | -0.0235 | -0.1318 | -0.1196 | 0.0398 | 0.0366 | 0.0009 | -0.0189 | 0.0442 | ... | 0.0092 | -0.0205 | -0.0786 | -0.1737 | -0.0139 | -0.0303 | -0.0210 | -0.0577 | -0.0246 | -0.0083 |
| V20 | -0.1935 | -0.3776 | 0.2207 | 0.3038 | 0.0058 | 0.2024 | 0.0405 | 0.2489 | 0.0395 | -0.0200 | ... | 0.2021 | 0.0808 | -0.2614 | -0.0077 | 0.0512 | 0.0354 | 0.2357 | 0.1615 | 0.2519 | 0.0000 |
| V21 | 0.0721 | -0.0852 | -0.0428 | -0.0075 | -0.0113 | 0.0024 | 0.0377 | -0.0445 | 0.0796 | 0.0174 | ... | 1.0000 | 0.9201 | -0.4082 | 0.0750 | 0.1383 | -0.0659 | -0.0100 | 0.0371 | 0.1522 | 0.0010 |
| V22 | 0.1124 | -0.0061 | -0.1154 | -0.0274 | -0.0382 | -0.0579 | 0.0833 | -0.1038 | 0.0530 | 0.0546 | ... | 0.9201 | 1.0000 | -0.4086 | 0.0733 | 0.1227 | -0.0649 | 0.0316 | -0.0260 | 0.0860 | 0.0005 |
| V23 | 0.2391 | 0.3593 | -0.2657 | -0.2511 | 0.0208 | -0.2347 | -0.1003 | -0.2654 | -0.0206 | 0.1690 | ... | -0.4082 | -0.4086 | 1.0000 | 0.2037 | -0.5813 | -0.0254 | -0.0557 | -0.1023 | -0.0177 | -0.0061 |
| V24 | -0.0003 | -0.0115 | 0.0413 | 0.0998 | 0.0818 | -0.2112 | -0.4003 | 0.0418 | -0.1525 | 0.0229 | ... | 0.0750 | 0.0733 | 0.2037 | 1.0000 | -0.0647 | 0.0169 | -0.0464 | 0.0632 | -0.0153 | 0.0003 |
| V25 | -0.3268 | 0.1505 | -0.0911 | 0.0022 | 0.1901 | -0.0318 | 0.0024 | -0.0263 | -0.1515 | -0.0454 | ... | 0.1383 | 0.1227 | -0.5813 | -0.0647 | 1.0000 | -0.0603 | -0.1731 | -0.1460 | 0.0352 | 0.0037 |
| V26 | -0.0553 | -0.0060 | 0.0162 | -0.0284 | -0.1013 | 0.0180 | -0.0658 | 0.0318 | -0.0206 | 0.0589 | ... | -0.0659 | -0.0649 | -0.0254 | 0.0169 | -0.0603 | 1.0000 | -0.2055 | -0.0431 | -0.0372 | -0.0032 |
| V27 | -0.0675 | -0.2669 | 0.2058 | 0.2603 | -0.0443 | 0.0135 | 0.1300 | 0.0402 | 0.2312 | -0.0420 | ... | -0.0100 | 0.0316 | -0.0557 | -0.0464 | -0.1731 | -0.2055 | 1.0000 | 0.5737 | -0.1216 | -0.0029 |
| V28 | -0.2024 | -0.3880 | 0.2990 | 0.3422 | 0.0126 | 0.0533 | 0.0225 | 0.1493 | 0.1944 | -0.1647 | ... | 0.0371 | -0.0260 | -0.1023 | 0.0632 | -0.1460 | -0.0431 | 0.5737 | 1.0000 | -0.0168 | -0.0006 |
| Amount | -0.0290 | 0.0078 | -0.3911 | -0.0024 | -0.0442 | -0.2647 | 0.1575 | -0.1172 | 0.0330 | -0.0507 | ... | 0.1522 | 0.0860 | -0.0177 | -0.0153 | 0.0352 | -0.0372 | -0.1216 | -0.0168 | 1.0000 | -0.0023 |
| Class | -0.0048 | -0.0022 | 0.0079 | 0.0013 | 0.0095 | 0.0065 | 0.0010 | 0.0056 | -0.0010 | -0.0041 | ... | 0.0010 | 0.0005 | -0.0061 | 0.0003 | 0.0037 | -0.0032 | -0.0029 | -0.0006 | -0.0023 | 1.0000 |
31 rows × 31 columns
dfc = correlation['Class'].sort_values(ascending=False)
dfc = pd.DataFrame(dfc)
dfc
| Class | |
|---|---|
| Class | 1.000000 |
| V4 | 0.009452 |
| V17 | 0.008973 |
| V2 | 0.007927 |
| V5 | 0.006523 |
| V16 | 0.006280 |
| V7 | 0.005647 |
| V18 | 0.004756 |
| V11 | 0.004570 |
| V25 | 0.003670 |
| V3 | 0.001260 |
| V21 | 0.001031 |
| V15 | 0.000998 |
| V6 | 0.000990 |
| V22 | 0.000456 |
| V24 | 0.000291 |
| V20 | 0.000007 |
| V28 | -0.000618 |
| V8 | -0.001003 |
| V1 | -0.002218 |
| Amount | -0.002288 |
| V27 | -0.002895 |
| V26 | -0.003176 |
| V12 | -0.003404 |
| V13 | -0.003517 |
| V10 | -0.003546 |
| V9 | -0.004078 |
| Time | -0.004837 |
| V23 | -0.006103 |
| V19 | -0.008324 |
| V14 | -0.012567 |
From the table above, we can see the features that have the most correlation with our target class.
dfc.iloc[1:11]
| Class | |
|---|---|
| V4 | 0.009452 |
| V17 | 0.008973 |
| V2 | 0.007927 |
| V5 | 0.006523 |
| V16 | 0.006280 |
| V7 | 0.005647 |
| V18 | 0.004756 |
| V11 | 0.004570 |
| V25 | 0.003670 |
| V3 | 0.001260 |
This is our top ten positively correlated features with our target values. The word correlated is used pretty loosely here because these values are very low. What this means is that a single feature is not enough to properly predict our fraud/n-fraud class.
Let's visualize this below.
correlations = cleaned_df.corr()['Class'].sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=correlations.values, y=correlations.index)
plt.title('Feature Correlations with Target')
plt.show()
# top 10 correlated features with the target
top_10_correlated = correlations[1:11]
display(top_10_correlated)
# names of the top 10 correlated features
top_10_correlated_features = top_10_correlated.index
top_10_correlated_features
V4 0.009452 V17 0.008973 V2 0.007927 V5 0.006523 V16 0.006280 V7 0.005647 V18 0.004756 V11 0.004570 V25 0.003670 V3 0.001260 Name: Class, dtype: float64
Index(['V4', 'V17', 'V2', 'V5', 'V16', 'V7', 'V18', 'V11', 'V25', 'V3'], dtype='object')
fig, axes = plt.subplots(5, 2, figsize=(20, 30))
fig.suptitle('Top 10 Correlated Features with Target')
for i, feature in enumerate(top_10_correlated_features):
sns.boxplot(ax=axes[i // 2, i % 2], x='Class', y=feature
, data=cleaned_df, showfliers=True)
axes[i // 2, i % 2].set_title(f'{feature} Distribution')
axes[i // 2, i % 2].set_xlabel('Class')
axes[i // 2, i % 2].set_ylabel(feature)
plt.show()
I will choose to only train our model using the top 10 correlated features.
X = cleaned_df.drop('Class', axis=1)
y = cleaned_df['Class']
X
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.959577 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 2.916484 |
| 1 | -0.959577 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | -0.304253 |
| 3 | -0.959565 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 2.343928 |
| 4 | -0.959553 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 1.170978 |
| 5 | -0.959553 | -0.425966 | 0.960523 | 1.141109 | -0.168252 | 0.420987 | -0.029728 | 0.476201 | 0.260314 | -0.568671 | ... | 0.084968 | -0.208254 | -0.559825 | -0.026398 | -0.371427 | -0.232794 | 0.105915 | 0.253844 | 0.081080 | -0.282771 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 284796 | 1.069721 | 1.884849 | -0.143540 | -0.999943 | 1.506772 | -0.035300 | -0.613638 | 0.190241 | -0.249058 | 0.666458 | ... | -0.153997 | 0.144008 | 0.634646 | -0.042114 | -0.053206 | 0.316403 | -0.461441 | 0.018265 | -0.041068 | 0.951995 |
| 284797 | 1.069745 | -0.241923 | 0.712247 | 0.399806 | -0.463406 | 0.244531 | -1.343668 | 0.929369 | -0.206210 | 0.106234 | ... | -0.139512 | -0.228876 | -0.514376 | 0.279598 | 0.371441 | -0.559238 | 0.113144 | 0.131507 | 0.081265 | -0.242876 |
| 284800 | 1.069768 | 2.039560 | -0.175233 | -1.196825 | 0.234580 | -0.008713 | -0.726571 | 0.017050 | -0.118228 | 0.435402 | ... | -0.256922 | -0.268048 | -0.717211 | 0.297930 | -0.359769 | -0.315610 | 0.201114 | -0.080826 | -0.075071 | -0.304472 |
| 284801 | 1.069780 | 0.120316 | 0.931005 | -0.546012 | -0.745097 | 1.130314 | -0.235973 | 0.812722 | 0.115093 | -0.204064 | ... | 0.000676 | -0.314205 | -0.808520 | 0.050343 | 0.102800 | -0.435870 | 0.124079 | 0.217940 | 0.068803 | -0.304253 |
| 284803 | 1.069803 | -0.732789 | -0.055080 | 2.035030 | -0.738589 | 0.868229 | 1.058415 | 0.024330 | 0.294869 | 0.584800 | ... | 0.059616 | 0.214205 | 0.924384 | 0.012463 | -1.016226 | -0.606624 | -0.395255 | 0.068472 | -0.053527 | 0.180184 |
146334 rows × 30 columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((117067, 30), (29267, 30), (117067,), (29267,))
model = LogisticRegression(max_iter=1000)
scores = cross_val_score(model, X_train, y_train, cv=5)
model.fit(X_train, y_train)
LogisticRegression(max_iter=1000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000)
print(model.coef_)
[[-0.74766799 0.28960949 0.44083742 0.36202213 0.23096235 -0.08531678 0.25782693 0.30432583 -0.21070329 0.1894693 -0.53779737 0.58322328 -0.36558853 -0.40378273 -1.56695142 0.65485352 0.71317105 0.76487635 0.37601602 -0.14127806 -0.11411718 -0.12207981 -0.19829875 -0.36901884 -0.19863475 0.29735889 -0.25192101 -0.0914607 -0.09562239 0.36756322]]
feature_names = X_train.columns
coefs=model.coef_[0]
top_ten = np.argpartition(coefs, -10)[-10:]
top_ten
print(feature_names[top_ten])
Index(['V25', 'Amount', 'V18', 'V3', 'V7', 'V2', 'V17', 'V16', 'V11', 'V15'], dtype='object')
The values above shows the weight that the model gave each feature. We have 30 values because we have 30 features in our dataset.
A positive coefficient increases the log-odds of the output, and thus increases the probability of the positive class. A negative coefficient decreases the log-odds of the output, and thus decreases the probability of the positive class.
The magnitude of a coefficient indicates the importance of the corresponding feature. A larger absolute value of a coefficient means that the feature is more important for the prediction. The sign of a coefficient indicates the direction of the relationship between the feature and the output. A positive coefficient means that the feature and the output are positively correlated, while a negative coefficient means that they are negatively correlated.
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': model.coef_[0]})
feature_importance = feature_importance.sort_values('Importance', ascending=False)
feature_importance
| Feature | Importance | |
|---|---|---|
| 17 | V17 | 0.764876 |
| 16 | V16 | 0.713171 |
| 15 | V15 | 0.654854 |
| 11 | V11 | 0.583223 |
| 2 | V2 | 0.440837 |
| 18 | V18 | 0.376016 |
| 29 | Amount | 0.367563 |
| 3 | V3 | 0.362022 |
| 7 | V7 | 0.304326 |
| 25 | V25 | 0.297359 |
| 1 | V1 | 0.289609 |
| 6 | V6 | 0.257827 |
| 4 | V4 | 0.230962 |
| 9 | V9 | 0.189469 |
| 5 | V5 | -0.085317 |
| 27 | V27 | -0.091461 |
| 28 | V28 | -0.095622 |
| 20 | V20 | -0.114117 |
| 21 | V21 | -0.122080 |
| 19 | V19 | -0.141278 |
| 22 | V22 | -0.198299 |
| 24 | V24 | -0.198635 |
| 8 | V8 | -0.210703 |
| 26 | V26 | -0.251921 |
| 12 | V12 | -0.365589 |
| 23 | V23 | -0.369019 |
| 13 | V13 | -0.403783 |
| 10 | V10 | -0.537797 |
| 0 | Time | -0.747668 |
| 14 | V14 | -1.566951 |
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
Accuracy: 0.9998291591211945 Precision: 0.0 Recall: 0.0 F1-Score: 0.0
y_pred_proba = model.predict_proba(X_test)[:, 1].ravel() # probabilities for the positive outcome
y_pred = model.predict(X_test).ravel() # predicted classes
# Create a DataFrame
df_predictions = pd.DataFrame({
'probability': y_pred_proba,
'prediction': y_pred,
'ground_truth': y_test.values.flatten()
})
# Filter the DataFrame to only include instances where the ground truth was 1
df_predictions = df_predictions[df_predictions['ground_truth'] == 1]
df_predictions
| probability | prediction | ground_truth | |
|---|---|---|---|
| 2849 | 2.837467e-03 | 0 | 1 |
| 13983 | 3.925579e-07 | 0 | 1 |
| 15391 | 6.250653e-04 | 0 | 1 |
| 15487 | 2.360204e-06 | 0 | 1 |
| 27704 | 1.599608e-05 | 0 | 1 |
We can see why the precision and recall is 0. Failed all positive test cases.
y_pred_proba = model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
print("AUC:", roc_auc_score(y_test, y_pred_proba))
AUC: 0.543428337092475
The AUC being 0.54 means that the model has no class separation capacity and is only as good as a random guessing.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
model.fit(X_train_res, y_train_res)
y_pred_res = model.predict(X_test_res)
print("Accuracy:", accuracy_score(y_test_res, y_pred_res))
print("Precision:", precision_score(y_test_res, y_pred_res, zero_division=0))
print("Recall:", recall_score(y_test_res, y_pred_res))
print("F1-Score:", f1_score(y_test_res, y_pred_res))
Accuracy: 0.96311167304538 Precision: 0.9316555872111427 Recall: 1.0 F1-Score: 0.964618737811573
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("KNN Precision:", precision_score(y_test, y_pred_knn, zero_division=0))
print("KNN Recall:", recall_score(y_test, y_pred_knn))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Precision:", precision_score(y_test, y_pred_svm, zero_division=0))
print("SVM Recall:", recall_score(y_test, y_pred_svm))
KNN Accuracy: 0.9998291591211945 KNN Precision: 0.0 KNN Recall: 0.0 SVM Accuracy: 0.9998291591211945 SVM Precision: 0.0 SVM Recall: 0.0
We see here that the KNN and SVM scored the exact same acuuracy score as our logistic regression.
They both also have a 0% precision and recall score. Showing exactly how unbalanced the dataset is for model training.